# Morning data for 2020-04-07. Disclaimers:
import requests
from matplotlib import pyplot as plt
import csv
from datetime import datetime
from datetime import timedelta, date
states_url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv'
cases_url = "https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
deaths_url = "https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"


def _fetch_csv_rows(url):
    """
    url::address of a comma separated text file
    returns
    list of rows, header row included, each row a list of strings
    raises requests.HTTPError when the server answers with an error status
    """
    with requests.get(url, stream=True) as r:
        # fail loudly instead of parsing an error page as if it were CSV
        r.raise_for_status()
        # iter_lines only yields str when the response has an encoding;
        # fall back to utf-8 (what this script has always asked for)
        if r.encoding is None:
            r.encoding = 'utf-8'
        text = r.iter_lines(decode_unicode=True)
        # blank lines parse to [] and cannot be indexed by column, drop them
        return [row for row in csv.reader(text, delimiter=',') if row]


states_rows = _fetch_csv_rows(states_url)
cases_rows = _fetch_csv_rows(cases_url)
deaths_rows = _fetch_csv_rows(deaths_url)
def get_data(cases_rows, deaths_rows, states_rows):
    """
    Build per-locale time series from the Johns Hopkins and NYT tables.

    cases_rows::rows of cases for each province, country in Johns Hopkins
        dataset; row 0 is the header and the dated columns start at index 4
    deaths_rows::rows of deaths for each province, country in Johns Hopkins
        dataset, same layout as cases_rows
    states_rows::rows from NYT dataset; row 0 is the header, then date in
        column 0, state in column 1, cases in column 3, deaths in column 4
    returns
    dates[locale]::array of dates for which we have data for a locale
    cases[locale]::array of cases for each date in dates
    deaths[locale]::array of deaths for each date in dates
    locales::set of every locale found in any of the three tables
    where locale is a tuple in the format of (province, country); province
    is '' for country level rows and NYT states use (state, 'US')
    """
    # every table starts with a header row, which is not a locale
    locales = {(row[0], row[1]) for row in cases_rows[1:]}
    locales |= {(row[0], row[1]) for row in deaths_rows[1:]}
    locales |= {(row[1], 'US') for row in states_rows[1:]}

    cases_header = cases_rows[0]
    deaths_header = deaths_rows[0]
    assert len(cases_header)==len(deaths_header), "cases and deaths not of equal length"

    # parse the dated header columns once; each locale gets its own copy
    header_dates = [datetime.strptime(item, "%m/%d/%y").date()
                    for item in cases_header[4:]]

    cases, deaths, dates = {}, {}, {}
    for locale in locales:
        cases[locale] = []
        deaths[locale] = []
        dates[locale] = list(header_dates)
    for row in cases_rows[1:]:
        cases[(row[0], row[1])].extend(int(item) for item in row[4:])
    for row in deaths_rows[1:]:
        deaths[(row[0], row[1])].extend(int(item) for item in row[4:])

    # create date zero, and set all values to zero
    first_date = header_dates[0]
    us_state_locales = {locale for locale in locales
                        if locale[1] == 'US' and locale[0] != ''}
    for state in us_state_locales:
        dates[state] = [first_date]
        cases[state].append(0)
        deaths[state].append(0)

    last_date = datetime.strptime(states_rows[-1][0], "%Y-%m-%d").date()

    # group the NYT rows by day once, so the day loop below does not have
    # to rescan and re-parse the whole table for every date
    rows_by_date = {}
    for row in states_rows[1:]:
        row_date = datetime.strptime(row[0], "%Y-%m-%d").date()
        rows_by_date.setdefault(row_date, []).append(row)

    def date_iter(start_date, end_date):
        """
        start_date::first day of returned generator
        end_date::final day of returned generator
        yield each date from start_date through end_date, one at a
        time; yields nothing when start_date is after end_date
        """
        curr_date = start_date
        while curr_date <= end_date:
            yield curr_date
            curr_date = curr_date + timedelta(days=1)

    for curr_day in date_iter(first_date + timedelta(days=1), last_date):
        # new date. Copy over last date's data (date zero guarantees
        # there always is a previous entry)
        for state in us_state_locales:
            dates[state].append(curr_day)
            cases[state].append(cases[state][-1])
            deaths[state].append(deaths[state][-1])
        # overwrite the copy for every state that reported on this day
        for row in rows_by_date.get(curr_day, ()):
            locale = (row[1], 'US')
            cases[locale][-1] = int(row[3])
            deaths[locale][-1] = int(row[4])
    return dates, cases, deaths, locales
# Build the module-level series that the plotting helpers below read directly.
dates, cases, deaths, locales = get_data(cases_rows, deaths_rows, states_rows)
def discrete_growth_fraction(data, threshold=10):
    """
    data::array of counts, one entry per sample
    threshold::a sample whose count is below this reports zero growth
    returns
    array one entry shorter than data; entry i is the step from data[i]
    to data[i+1] divided by the mean of the two values, i.e.
    (data[i+1] - data[i]) * 2 / (data[i] + data[i+1])
    """
    result = []
    for prev, curr in zip(data, data[1:]):
        total = prev + curr
        if total == 0 or curr < threshold:
            # nothing to divide by, or too early for the ratio to mean much
            result.append(0)
        else:
            result.append((curr - prev) * 2 / total)
    return result
def plot_growth_fraction(locale, plot_type):
    """
    Show the day to day growth fraction of one series for one locale.

    locale::location to track in (province, country) form
    plot_type::'cases' or 'deaths'
    raises ValueError for any other plot_type
    """
    if plot_type == 'cases':
        series = cases[locale]
    elif plot_type == 'deaths':
        series = deaths[locale]
    else:
        # previously this fell through and showed an empty, unlabelled figure
        raise ValueError("plot_type must be 'cases' or 'deaths', got %r" % (plot_type,))
    plt.plot(discrete_growth_fraction(series))
    plt.ylabel(plot_type + ' growth fraction')
    # name the province too, the same way plot_growth_with_total titles its figure
    if locale[0] != '':
        plt.title(locale[0] + ', ' + locale[1])
    else:
        plt.title(locale[1])
    plt.xlabel('sample (day)')
    plt.show()
def moving_average(data, window):
    """
    data::sequence of numbers to smooth
    window::how many consecutive entries go into each mean
    returns
    list with one mean per full window, in order; empty when window is
    longer than data
    """
    n_windows = len(data) - window + 1
    return [sum(data[start:start + window]) / window
            for start in range(n_windows)]
# Sanity checks for moving_average; they run every time this file is executed.
assert moving_average([1,3,5,7],1)==[1,3,5,7]
assert moving_average([1,3,5,7],2)==[2,4,6]
assert moving_average([1],1)==[1]
def plot_growth_with_total(locale, window = 1):
    """
    Show one figure for a locale: averaged growth of cases and deaths on
    the left axis, running totals of both on a log scaled right axis.

    locale::in format ('province','country'), leave province '' if none
    window::moving window to average over
    """
    data = {'cases': cases[locale], 'deaths': deaths[locale]}
    t = dates[locale]
    processed_data = {}
    for plot_type, series in data.items():
        avg_data = moving_average(discrete_growth_fraction(series), window)
        # left-pad with zeros so the averaged curve is as long as the
        # series it came from; sizing the pad from the series (rather than
        # always using window) also covers a series shorter than window
        padding = [0] * (len(series) - len(avg_data))
        processed_data[plot_type] = padding + avg_data

    fig, ax1 = plt.subplots()
    if locale[0] != '':
        title = locale[0] + ', ' + locale[1]
    else:
        title = locale[1]
    ax1.set_title(title)
    ax1.set_xlabel('day')
    ax1.set_ylabel('growth', color='tab:blue')
    for plot_type, color in [('cases','tab:green'),('deaths','tab:blue')]:
        ax1.plot(t, processed_data[plot_type], color=color, label=plot_type + ' growth')
    ax1.tick_params(axis='y', labelcolor='tab:blue')
    ax1.legend(loc='upper left')
    ax1.grid(axis = 'y')
    ax1.set_yticks([0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5])

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    ax2.set_ylabel('total')  # we already handled the x-label with ax1
    ax2.set_yscale('log')
    for plot_type, color in [('cases','tab:orange'),('deaths','tab:red')]:
        ax2.plot(t, data[plot_type], color=color, label=plot_type + ' totals')
    ax2.tick_params(axis='y', labelcolor= 'tab:red')
    ax2.legend(loc='center left')

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.show()
# final output section
# list every locale ordered by country then province, and plot the ones
# with more than 100 cases and more than 10 deaths
sorted_locales = sorted(locales, key=lambda x: x[1] + x[0])
for locale in sorted_locales:
    if locale[0] != '':
        print(locale[0] + ", " + locale[1])
    else:
        print(locale[1])
    # a locale found in only one dataset has an empty series; default=0
    # makes it skip the plot instead of max() raising ValueError
    if max(cases[locale], default=0) > 100 and max(deaths[locale], default=0) > 10:
        plot_growth_with_total(locale, window = 7)